# Load packages
library(GAMBLR.open)
library(tidyverse)
# Set a custom GAMBLR theme
theme_set(theme_Morons())
# Custom ggplot function that always adds consistent colors and sets scales
ggplot_consistent <- function(...) {
ggplot(...) +
scale_fill_manual(values = get_gambl_colours()) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
}
# Example gene
my_genes <- c("MYC")Tutorial: getting started
This is a quick tour of some basic commands and usage patterns, just to get you started.
This tutorial explores how to retrieve different data types bundled within GAMBLR.data. Commonly, GAMBLR functions are prefixed with get_. These functions are readily available for returning data of different types: Simple Somatic Mutations (SSM), Copy Number (CN) segments and Structural Variants (SV). This resource explores commonly occurring arguments across different functions, best-practices and recommendations in the scope of retrieving data.
How do I obtain metadata?
First, let’s start with retrieving metadata for all GAMBL samples. We can control which samples to be included in the output with seq_type_filter argument, which returns genome samples by default. To return metadata for capture samples, set seq_type_filter = "capture". It is also possible to return metadata for more than one seq type, e.g seq_type_filter = c("genome", "capture").
metadata <- list()
# Get gambl metadata for genome samples
metadata$genomes <- get_gambl_metadata(
seq_type_filter = "genome"
)
metadata$capture <- get_gambl_metadata(
seq_type_filter = "capture"
)
metadata$all <- get_gambl_metadata(
seq_type_filter = c("genome", "capture")
)
metadata$cell_lines <- get_gambl_metadata() %>%
filter(
cohort == "DLBCL_cell_lines"
)Now that we have the metadata, we can look at the expected column names and their format:
str(metadata$all)'data.frame': 2863 obs. of 30 variables:
$ age_group : chr "Other" "Other" "Other" "Other" ...
$ bam_available : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ cohort : chr "BL_cell_lines" "BL_cell_lines" "BL_cell_lines" "BL_cell_lines" ...
$ compression : chr "cram" "cram" "cram" "cram" ...
$ COO_consensus : chr NA NA NA NA ...
$ DHITsig_consensus : chr "NA" "NA" "NA" "NA" ...
$ EBV_status_inf : chr "EBV-positive" "EBV-negative" "EBV-negative" "EBV-negative" ...
$ ffpe_or_frozen : chr "frozen" "frozen" "frozen" "frozen" ...
$ fl_grade : chr NA NA NA NA ...
$ genetic_subgroup : chr "DGG-BL" "DGG-BL" "IC-BL" "IC-BL" ...
$ genome_build : chr "hg38" "hg38" "hg38" "hg38" ...
$ hiv_status : chr NA NA NA NA ...
$ lymphgen : chr "ST2" "Other" "Other" "Other" ...
$ lymphgen_cnv_noA53 : chr "ST2" "Other" "Other" "Other" ...
$ lymphgen_no_cnv : chr "ST2" "Other" "Other" "Other" ...
$ lymphgen_with_cnv : chr "ST2" "Other" "Other" "Other" ...
$ lymphgen_wright : chr NA NA NA NA ...
$ molecular_BL : chr NA NA NA NA ...
$ normal_sample_id : chr NA NA NA NA ...
$ pairing_status : chr "unmatched" "unmatched" "unmatched" "unmatched" ...
$ pathology : chr "BL" "BL" "BL" "BL" ...
$ pathology_rank : num 7 7 7 7 7 7 7 7 7 7 ...
$ patient_id : chr "BLGSP-71-29-00539" "BLGSP-71-29-00525" "BLGSP-71-29-00528" "BLGSP-71-29-00526" ...
$ reference_PMID : num 36201743 36201743 36201743 36201743 36201743 ...
$ sample_id : chr "Akata" "BL2" "BL30" "BL41" ...
$ seq_type : chr "genome" "genome" "genome" "genome" ...
$ sex : chr "NA" "NA" "NA" "NA" ...
$ study : chr "BL_Thomas" "BL_Thomas" "BL_Thomas" "BL_Thomas" ...
$ time_point : chr "A" "A" "A" "A" ...
$ Tumor_Sample_Barcode: chr "Akata" "BL2" "BL30" "BL41" ...
We can now use the metadata as we wish. For example, we can visualize the counts of samples per pathology and sequencing type:
# We can see what is included in the metadata
metadata$all %>%
count(pathology, seq_type) %>%
ggplot_consistent(
aes(
x = pathology,
y = n,
fill = pathology
)
) +
geom_bar(stat = "identity") +
facet_wrap(~seq_type, scales = "free") +
geom_text(aes(label=n), size=3.5) +
theme(
axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
)
)# We can also visualize these counts when subset to only DLBCL:
# Subset metadata on a set of samples (samples classified as DLBCL for pathology)
metadata$dlbcl <- metadata$all %>%
filter(pathology == "DLBCL")
metadata$dlbcl %>%
count(pathology, seq_type) %>%
ggplot_consistent(
aes(
x = pathology,
y = n,
fill = pathology
)
) +
geom_bar(stat = "identity") +
facet_wrap(~seq_type) +
geom_text(aes(label=n), size=3.5)How do I obtain SSM?
Based on the information available to you, your application, or your downstream analysis, there are multiple ways to retrieve SSM data. For example, if you know the sample ID and are only interested in looking at SSM results for that particular sample, you can use get_ssm_by_sample. If multiple samples are to be analyzed, get_ssm_by_samples (plural version) is recommended. You can also use patient IDs for retrieving this data, in this case get_ssm_by_patients is available. In addition, you can also restrict SSM calls to specific genomic regions with get_ssm_by_regions or get_ssm_by_region.
Another possibility is to focus on coding mutations only and call get_coding_ssm, this function returns all coding SSMs from the bundled data in maf-like format. If you have an already pre-filtered metadata, the these_samples_metadata argument can be used with all SSM functions to restrict the variants returned to the sample IDs in this data frame, handy!
By Samples
Return SSMs for one or more samples with get_ssm_by_samples. In the example below, we are requesting SSM for the DOHH-2 cell line in two different ways:
Using these_sample_ids
my_sample_id <- "DOHH-2"
# Using the these_samples_id argument
ssm_sample <- get_ssm_by_samples(these_sample_ids = my_sample_id)
# How many mutations do we get back?
dim(ssm_sample)[1] 22089 49
# What columns are available?
colnames(ssm_sample) [1] "Hugo_Symbol" "Entrez_Gene_Id"
[3] "Center" "NCBI_Build"
[5] "Chromosome" "Start_Position"
[7] "End_Position" "Strand"
[9] "Variant_Classification" "Variant_Type"
[11] "Reference_Allele" "Tumor_Seq_Allele1"
[13] "Tumor_Seq_Allele2" "dbSNP_RS"
[15] "dbSNP_Val_Status" "Tumor_Sample_Barcode"
[17] "Matched_Norm_Sample_Barcode" "Match_Norm_Seq_Allele1"
[19] "Match_Norm_Seq_Allele2" "Tumor_Validation_Allele1"
[21] "Tumor_Validation_Allele2" "Match_Norm_Validation_Allele1"
[23] "Match_Norm_Validation_Allele2" "Verification_Status"
[25] "Validation_Status" "Mutation_Status"
[27] "Sequencing_Phase" "Sequence_Source"
[29] "Validation_Method" "Score"
[31] "BAM_File" "Sequencer"
[33] "Tumor_Sample_UUID" "Matched_Norm_Sample_UUID"
[35] "HGVSc" "HGVSp"
[37] "HGVSp_Short" "Transcript_ID"
[39] "Exon_Number" "t_depth"
[41] "t_ref_count" "t_alt_count"
[43] "n_depth" "n_ref_count"
[45] "n_alt_count" "RefSeq"
[47] "Pipeline" "Study"
[49] "maf_seq_type"
# What variants are available?
ssm_sample %>%
count(Variant_Classification) %>%
ggplot_consistent(
aes(
x = Variant_Classification,
y = n,
fill = Variant_Classification
)
) +
geom_bar(stat = "identity") +
geom_text(aes(label=n), size=3.5, vjust = -0.5) +
theme(
axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
)
)Using these_samples_metadata
We can supply instead a metadata table that has already been subset to the sample ID(s) of interest.
metadata$dohh2 <- metadata$genome %>%
filter(sample_id == "DOHH-2")
# Using the these_samples_metadata argument
ssm_meta <- get_ssm_by_samples(
these_samples_metadata = metadata$dohh2
)
# How many mutations do we get back?
dim(ssm_meta)[1] 22089 49
# What columns are available?
colnames(ssm_meta) [1] "Hugo_Symbol" "Entrez_Gene_Id"
[3] "Center" "NCBI_Build"
[5] "Chromosome" "Start_Position"
[7] "End_Position" "Strand"
[9] "Variant_Classification" "Variant_Type"
[11] "Reference_Allele" "Tumor_Seq_Allele1"
[13] "Tumor_Seq_Allele2" "dbSNP_RS"
[15] "dbSNP_Val_Status" "Tumor_Sample_Barcode"
[17] "Matched_Norm_Sample_Barcode" "Match_Norm_Seq_Allele1"
[19] "Match_Norm_Seq_Allele2" "Tumor_Validation_Allele1"
[21] "Tumor_Validation_Allele2" "Match_Norm_Validation_Allele1"
[23] "Match_Norm_Validation_Allele2" "Verification_Status"
[25] "Validation_Status" "Mutation_Status"
[27] "Sequencing_Phase" "Sequence_Source"
[29] "Validation_Method" "Score"
[31] "BAM_File" "Sequencer"
[33] "Tumor_Sample_UUID" "Matched_Norm_Sample_UUID"
[35] "HGVSc" "HGVSp"
[37] "HGVSp_Short" "Transcript_ID"
[39] "Exon_Number" "t_depth"
[41] "t_ref_count" "t_alt_count"
[43] "n_depth" "n_ref_count"
[45] "n_alt_count" "RefSeq"
[47] "Pipeline" "Study"
[49] "maf_seq_type"
# What variants are available?
ssm_meta %>%
count(Variant_Classification) %>%
ggplot_consistent(
aes(
x = Variant_Classification,
y = n,
fill = Variant_Classification
)
) +
geom_bar(stat = "identity") +
geom_text(aes(label=n), size=3.5, vjust = -0.5) +
theme(
axis.text.x = element_text(
angle = 90,
vjust = 0.5,
hjust = 1
)
)We can make sure that both approaches generate identical outputs:
identical(
ssm_sample,
ssm_meta
)[1] TRUE
Thus, there is no “right” or “wrong” way, it is simply your personal preference!
In a different projection
Often many downstream tools can only work on one specific genome build, and GAMBLR.data provides a simple and straightforward way to obtain variants in different projections. The default output is always with respect to grch37, and it can be easily modified with argument projection:
ssm_hg38 <- get_ssm_by_samples(
projection = "hg38"
)
# Sanity check the projection
ssm_hg38 %>%
count(NCBI_Build) %>%
ggplot_consistent(
aes(
x = NCBI_Build,
y = n,
fill = NCBI_Build
)
) +
geom_bar(stat = "identity") +
geom_text(aes(label=n), size=3.5, vjust = -0.5)As we did not specify any sample ID, metadata, or gene to the above call, it returned the data for all samples available in GAMBLR.data, and we can see from the plot that all of the variants are with respect to hg38. Sweet! 😎
By Region
In this section, we are exploring the different ways you can obtain the maf data for a specific region (or regions) of interest.
In this example, we will obtain SSM calls for all aSHM regions associated with PAX5 across all available samples. With this function we also will get the region name added to the returned data frame. If you are providing regions as a bed file (regions_bed), you have the option of setting use_name_column = TRUE. If you do so, your bed file should have a column simply named “name”. In this case, the function will keep this column for naming the returned regions in the maf. With streamlined = TRUE the function returns the minimal number of columns.
Don’t know coordinates of aSHM at PAX5? GAMBLR.data has you covered!
# Get aSHM genes, select the columns of interest and rename for
# get_ssm_by_regions compatibility
ashm_gene <- "PAX5"
regions <- create_bed_data(
GAMBLR.data::grch37_ashm_regions %>%
filter(gene == ashm_gene),
fix_names = "concat",
concat_cols = c("gene","region"),
sep="-"
)
# Get ssm for all ashm regions
ashm_ssm <- get_ssm_by_regions(
these_samples_metadata = metadata$genomes,
regions_bed = regions,
streamlined = FALSE
)
dim(ashm_ssm)[1] 3238 50
ashm_ssm[1:5,1:10]genomic_data Object
Genome Build:
Showing first 10 rows:
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
1 PAX5 0 . GRCh37 9 37033520
2 PAX5 0 . GRCh37 9 37034762
3 PAX5 0 . GRCh37 9 37033894
4 PAX5 0 . GRCh37 9 37035326
5 PAX5 0 . GRCh37 9 37030604
End_Position Strand Variant_Classification Variant_Type
1 37033520 + Intron SNP
2 37034762 + 5'Flank SNP
3 37033894 + Intron SNP
4 37035326 + 5'Flank SNP
5 37030604 + Intron SNP
Coding SSM
Lastly, another way to retrieve SSM is to call get_coding_ssm. This function returns coding SSM for any given sample. This function is a convenient option for anyone interested in focusing only on coding mutations. Convenient filtering arguments are included in this function for easy and straightforward subsetting. If these arguments are not used, coding SSM will be returned for all samples. Of course, similar to the examples above, you can provide a metadata subset that has already been filtered to the sample IDs of interest (using these_samples_metadata).
# All default arguments
all_ssm <- get_coding_ssm()
dim(all_ssm)[1] 13913 49
# Provide metadata
dlbcl_cell_lines <- get_coding_ssm(
these_samples_metadata = metadata$cell_lines
)
dim(dlbcl_cell_lines)[1] 1616 49
How do I obtain CNV?
For the purpose of retrieving CN data, we have the function get_sn_segments. If you want to query CN calls for a specific region or genomic loci, get_cn_segments is best used. In this section we will demonstrate how this function can be used.
Using these_samples_metadata
We can use metadata restricted to the sample ID of interest to demonstrate what type of data will be returned:
seg <- get_cn_segments(
these_samples_metadata = metadata$cell_lines
)
# What are the columns we have available?
head(seg)SEG Data Object
Genome Build: grch37
Showing first 10 rows:
ID chrom start end LOH_flag log.ratio CN
1 DOHH-2 1 10001 86026719 0 0 2
2 DOHH-2 1 86026720 86688464 1 0 2
3 DOHH-2 1 86688465 121499999 0 0 2
4 DOHH-2 1 142600001 249250620 0 0 2
5 DOHH-2 2 10001 89087285 0 0 2
6 DOHH-2 2 89087286 89997184 1 -10 0
In a different projection
We can retrieve CN segments while also requesting a different projection. Similar to the SSM functionality shown earlier, this can be done by toggling the projection argument and switching it to the hg38 value.
# Call in hg38 projection
seg <- get_cn_segments(
these_samples_metadata = metadata$cell_lines,
projection = "hg38"
)
head(seg)SEG Data Object
Genome Build: hg38
Showing first 10 rows:
ID chrom start end LOH_flag log.ratio CN
1 DOHH-2 chr1 10001 85561036 0 0 2
2 DOHH-2 chr1 85561037 86222781 1 0 2
3 DOHH-2 chr1 86222782 121699999 0 0 2
4 DOHH-2 chr1 143200001 248956421 0 0 2
5 DOHH-2 chr2 1 91799999 0 0 2
6 DOHH-2 chr2 96000001 242193528 0 0 2
How do I obtain SV?
In this last section, we will explore how to get SV data using GAMBLR.data. For this purpose get_manta_sv was developed. This function can restrict the returned calls to any genomic regions specified with chromosome, qstart, qend, or the complete region specified with the region argument (in chr:start-end format). In addition, useful filtering arguments are also available, use min_vaf to set the minimum tumour VAF in order for a SV to be returned and min_score to set the lowest Manta somatic score in order for a SV to be returned. pair_status can be used to obtain variants from either matched or unmatched samples.
Default behaviour
We will call get_manta_sv with default arguments to examine the output.
# Default arguments
all_manta <- get_manta_sv()
# How many SVs do we get back?
nrow(all_manta)[1] 1154
# How many samples do we have SV calls for?
length(unique(all_manta$tumour_sample_id))[1] 580
# What does the returned data frame look like?
head(all_manta) CHROM_A START_A END_A CHROM_B START_B END_B
1 1 161658631 161658631 3 16509907 16509907
2 1 161663959 161663959 9 37363320 37363320
3 1 161663959 161663959 9 37363320 37363320
4 11 65267283 65267283 14 106110907 106110907
5 11 65267422 65267422 14 106110905 106110905
6 13 91976545 91976545 14 106211857 106211857
manta_name SCORE STRAND_A STRAND_B tumour_sample_id
1 MantaBND:21171:0:1:0:0:0 133 + + FL2002T1
2 MantaBND:206628:0:1:0:0:0 122 + + 09-15842_tumorA
3 MantaBND:195941:0:1:0:0:0 151 + + 09-15842_tumorB
4 MantaBND:152220:0:1:0:0:0:0 88 + - 15-38154T
5 MantaBND:152220:0:1:0:0:0:0 135 - + 15-38154T
6 MantaBND:18:59794:59817:0:1:0 90 - + 15-31924T
normal_sample_id VAF_tumour DP pair_status FILTER
1 FL2002N 0.331 127 matched PASS
2 09-15842_normal 0.281 196 matched PASS
3 09-15842_normal 0.364 187 matched PASS
4 15-38154N 0.150 167 matched PASS
5 15-38154N 0.290 169 matched PASS
6 15-31924N 0.365 85 matched PASS
Using these_samples_metadata example
Alternatively, you can also provide these_samples_metadata argument to make use of a pre-filtered metadata table. In this case, the returned SVs will be restricted to the sample_ids within the data frame.
cell_line_manta <- get_manta_sv(
these_samples_metadata = metadata$cell_lines
)
# How many SVs do we get back?
nrow(cell_line_manta)[1] 13
# How many samples do we have SV calls for?
length(unique(cell_line_manta$tumour_sample_id))[1] 3
# What does the returned data frame look like?
head(cell_line_manta) CHROM_A START_A END_A CHROM_B START_B END_B
1 14 106329462 106329462 18 60774579 60774579
2 14 106329465 106329465 18 60793497 60793497
3 14 106330466 106330466 18 60793914 60793914
4 14 106349765 106349765 18 60793914 60793914
5 14 106379091 106379091 18 60793492 60793492
6 14 106380227 106380227 18 60774578 60774578
manta_name SCORE STRAND_A STRAND_B tumour_sample_id
1 MantaBND:220769:1:2:0:0:0 134 + - SU-DHL-10
2 MantaBND:194451:1:2:0:0:0 103 + - DOHH-2
3 MantaBND:217561:1:2:0:0:0 182 + - SU-DHL-4
4 MantaBND:217561:0:1:0:0:0 198 - + SU-DHL-4
5 MantaBND:194451:0:1:0:0:0 91 - + DOHH-2
6 MantaBND:220769:0:1:0:0:0 169 - + SU-DHL-10
normal_sample_id VAF_tumour DP pair_status FILTER
1 14-11247N 0.318 66 unmatched PASS
2 14-11247N 0.290 69 unmatched PASS
3 14-11247N 0.474 57 unmatched PASS
4 14-11247N 0.500 62 unmatched PASS
5 14-11247N 0.300 60 unmatched PASS
6 14-11247N 0.578 45 unmatched PASS
By Region
We can call get_manta_sv specifying the region of interest first in the region format and then with specifying the chromosome, start and end individually.
Don’t know coordinates of MYC gene region? GAMBLR got you covered!
# Convert the gene name to the region
myc_region <- gene_to_region(
gene_symbol = "MYC"
)
myc_region MYC
"8:128747680-128753674"
# Specifying MYC in region format
myc_manta_region <- get_manta_sv(
these_samples_metadata = metadata$genomes,
region = myc_region,
min_vaf = 0,
min_score = 0
)
# Specifying MYC with chromosome, qstart and qend arguments
myc_manta_chunks <- get_manta_sv(
these_samples_metadata = metadata$genomes,
chromosome = 8,
qstart = 128747680,
qend = 128753674,
min_vaf = 0,
min_score = 0
)
# Are the returned data frames the same?
identical(
myc_manta_region,
myc_manta_chunks
)[1] TRUE
SV filtering
Here we are demonstrating the filtering options to obtain SVs. In this example, we are calling get_manta_sv on the DLBCL metadata subset. For demonstration purposes, we are also requesting a non-default projection and adding some more filtering.
# Get manta SVs for the samples with DLBCL pathology
dlbcl_manta <- get_manta_sv(
these_samples_metadata = metadata$dlbcl,
projection = "hg38",
min_vaf = 0.4,
min_score = 100
)
# How many variants do we get back with these filters?
nrow(dlbcl_manta)[1] 68
# Does the advertised VAF filters work?
all(dlbcl_manta$VAF_tumour >= 0.4)[1] TRUE
# Do the advertised SCORE filter work?
all(dlbcl_manta$SCORE >= 100)[1] TRUE
# What does the returned data frame look like?
head(cell_line_manta) CHROM_A START_A END_A CHROM_B START_B END_B
1 14 106329462 106329462 18 60774579 60774579
2 14 106329465 106329465 18 60793497 60793497
3 14 106330466 106330466 18 60793914 60793914
4 14 106349765 106349765 18 60793914 60793914
5 14 106379091 106379091 18 60793492 60793492
6 14 106380227 106380227 18 60774578 60774578
manta_name SCORE STRAND_A STRAND_B tumour_sample_id
1 MantaBND:220769:1:2:0:0:0 134 + - SU-DHL-10
2 MantaBND:194451:1:2:0:0:0 103 + - DOHH-2
3 MantaBND:217561:1:2:0:0:0 182 + - SU-DHL-4
4 MantaBND:217561:0:1:0:0:0 198 - + SU-DHL-4
5 MantaBND:194451:0:1:0:0:0 91 - + DOHH-2
6 MantaBND:220769:0:1:0:0:0 169 - + SU-DHL-10
normal_sample_id VAF_tumour DP pair_status FILTER
1 14-11247N 0.318 66 unmatched PASS
2 14-11247N 0.290 69 unmatched PASS
3 14-11247N 0.474 57 unmatched PASS
4 14-11247N 0.500 62 unmatched PASS
5 14-11247N 0.300 60 unmatched PASS
6 14-11247N 0.578 45 unmatched PASS